In [2]:
cd ..


/afs/inf.ed.ac.uk/user/s13/s1320903/Neuroglycerin/neukrill-net-work

In [3]:
import pylearn2.utils
import pylearn2.config
import theano
import neukrill_net.utils
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import holoviews as hl
%load_ext holoviews.ipython
import sklearn.metrics
import pickle
import neukrill_net.utils
import neukrill_net.encoding as enc
import neukrill_net.taxonomy as t


Using gpu device 2: Tesla K40c
:0: FutureWarning: IPython widgets are experimental and may change in the future.
Welcome to the HoloViews IPython extension! (http://ioam.github.io/holoviews/)
Available magics: %compositor, %opts, %params, %view, %%labels, %%opts, %%view
<matplotlib.figure.Figure at 0x7f81d805f850>
<matplotlib.figure.Figure at 0x7f81d805ff50>
<matplotlib.figure.Figure at 0x7f81d805fd50>

In [6]:
%env THEANO_FLAGS = device=gpu0,floatX=float32,base_compiledir=~/.theano/stonesoup0


env: THEANO_FLAGS=device=gpu0,floatX=float32,base_compiledir=~/.theano/stonesoup0

In [4]:
settings = neukrill_net.utils.Settings("settings.json")
run_settings = neukrill_net.utils.load_run_settings('run_settings/experiment_setting_colnorms_higher_aug.json', 
            settings, force=True)
model = pylearn2.utils.serial.load(run_settings['pickle abspath'])

In [7]:
# format the YAML
yaml_string = neukrill_net.utils.format_yaml(run_settings, settings)
# load proxied objects
proxied = pylearn2.config.yaml_parse.load(yaml_string, instantiate=False)
# pull out proxied dataset
proxdata = proxied.keywords['dataset']
# force loading of dataset and switch to test dataset
proxdata.keywords['force'] = True
proxdata.keywords['training_set_mode'] = 'test'
proxdata.keywords['verbose'] = False
# then instantiate the dataset
dataset = pylearn2.config.yaml_parse._instantiate(proxdata)

In [8]:
if hasattr(dataset.X, 'shape'):
    N_examples = dataset.X.shape[0]
else:
    N_examples = len(dataset.X)
batch_size = 500
while N_examples%batch_size != 0:
    batch_size += 1
n_batches = int(N_examples/batch_size)
#n_classes = len(settings.classes)

In [9]:
model.set_batch_size(batch_size)
X = model.get_input_space().make_batch_theano()
Y = model.fprop(X)
if type(X) == tuple:
    f = theano.function(X,Y)
else:
    f = theano.function([X],Y)

In [10]:
augment = 1
y = np.zeros((N_examples*augment,188))
# get the data specs from the cost function using the model
pcost = proxied.keywords['algorithm'].keywords['cost']
cost = pylearn2.config.yaml_parse._instantiate(pcost)
data_specs = cost.get_data_specs(model)
i = 0

In [11]:
for _ in range(augment):
    # make sequential iterator
    iterator = dataset.iterator(batch_size=batch_size,num_batches=n_batches,
                        mode='even_sequential', data_specs=data_specs)
    for batch in iterator:
        if type(X) == tuple:
            y[i*batch_size:(i+1)*batch_size,:] = f(batch[0],batch[1])
        else:
            y[i*batch_size:(i+1)*batch_size,:] = f(batch[0])
        i += 1

In [12]:
af = run_settings.get("augmentation_factor",1)
if af > 1:
    y_collapsed = np.zeros((int(N_examples/af), 188)) 
    for i,(low,high) in enumerate(zip(range(0,dataset.y.shape[0],af),
                                range(af,dataset.y.shape[0]+af,af))):
        y_collapsed[i,:] = np.mean(y[low:high,:], axis=0)
    y = y_collapsed
    # and collapse labels
    labels = dataset.y[range(0,dataset.y.shape[0],af)]
elif augment > 1:
    y_collapsed = np.zeros((N_examples,188))
    # different kind of augmentation, has to be collapsed differently
    for row in range(N_examples):
        y_collapsed[row,:] = np.mean(np.vstack([r for r in 
            y[[i for i in range(row,N_examples*augment,N_examples)],:]]), 
            axis=0)
    y = y_collapsed            
    labels = dataset.y
else:
    labels = dataset.y

In [196]:
y.shape


Out[196]:
(3089, 2)

In [115]:
predictions = np.zeros(y.shape)
np.copyto(predictions, y)
predictions.shape


Out[115]:
(3089, 188)

Predict with the classes.


In [29]:
class_predictions = np.zeros((y.shape[0], 121))
np.copyto(class_predictions, predictions[:, :121])
labels = labels[:, :121]
logloss = sklearn.metrics.log_loss(labels,class_predictions)
print("Log loss: {0}".format(logloss))


Log loss: 0.935334935387

Now let's try with superclasses.


In [117]:
superclasses = np.zeros((y.shape[0], 38))
np.copyto(superclasses, predictions[:,121:(121+38)])
superclasses.shape


Out[117]:
(3089, 38)

In [31]:
import neukrill_net.taxonomy as t
only_leaf_children = []
hier = enc.get_hierarchy(settings)

layer = t.TaxonomyLayer(1)

for s in hier[1]:
    flag = True
    for key, values in t.superclasses.items():
        for i, v in enumerate(values):
            if v == s and i != 1:
                flag = False
    if flag:
        if s not in only_leaf_children:
            only_leaf_children.append(s)

In [32]:
only_leaf_children


Out[32]:
['acantharia',
 'appendicularians',
 'calanoid',
 'chaetognaths',
 'cydippid',
 'decapods_all',
 'detritus',
 'diatoms',
 'euphausiids_all_ages',
 'fish',
 'no_class',
 'oithona',
 'other_hydromedusae',
 'physonect',
 'pluteus',
 'pteropods',
 'radiolarian',
 'rocketship',
 'seastar',
 'sphaeronectes',
 'sub_hydromedusae1',
 'sub_hydromedusae2',
 'sub_protists',
 'trichodesmium',
 'tunicate',
 'unknown']

In [33]:
import neukrill_net.taxonomy as t
layer = t.TaxonomyLayer(1)

backmap = {}
for i, c in enumerate(settings.classes):
    j = int(np.where(np.array(hier[1]) == layer[c])[0])
    if hier[1][j] in only_leaf_children:
        try:
            backmap[j] += [i]
        except:
            backmap[j] = [i]

In [34]:
backmap


Out[34]:
{0: [0, 1, 2],
 1: [4, 5, 6, 7],
 2: [14, 15, 16, 17, 18, 19, 20, 21, 22, 26],
 4: [10, 11, 12],
 8: [29, 30],
 9: [32, 92, 94, 95],
 10: [33, 34, 35, 49],
 11: [36, 37],
 13: [47, 48],
 14: [50, 51, 52, 53, 54, 55],
 17: [8, 9],
 18: [24, 25],
 19: [58, 59, 66, 67, 68, 69, 70, 71, 74, 75, 76, 77],
 21: [104, 105],
 23: [38, 39, 40, 41, 45],
 25: [87, 88, 89],
 26: [90, 91],
 27: [97, 98],
 28: [42, 43],
 31: [99, 100, 101],
 32: [57, 60, 61, 62],
 33: [63, 64, 65, 72, 73],
 34: [82, 83, 85, 86],
 35: [108, 109, 110, 111],
 36: [113, 114, 115, 116, 117],
 37: [118, 119, 120]}

In [35]:
nr = np.zeros(class_predictions.shape)
np.copyto(nr, class_predictions)
weight = 0.01

for index,r in enumerate(superclasses):
    if index%1000 == 0:
        print index
    for i,j in enumerate(r):
        priors = []
        if i in backmap:
            d = sum([settings.class_priors[k] for k in backmap[i]])
            for a in backmap[i]:
                priors.append(settings.class_priors[a]/d)
            nr[index,backmap[i]] = nr[index, backmap[i]]*(1-weight) + (weight)*j*np.array(priors)


0
1000
2000
3000

In [36]:
nr.shape


Out[36]:
(3089, 121)

With weight=0 it's the same.


In [18]:
logloss = sklearn.metrics.log_loss(labels,nr)
print("Log loss: {0}".format(logloss))


Log loss: 0.935334935387

With weight=0.01.


In [37]:
logloss = sklearn.metrics.log_loss(labels,nr)
print("Log loss: {0}".format(logloss))


Log loss: 0.933100272368

In [38]:
superclass2_children = {}
settings = neukrill_net.utils.Settings("settings.json")
hier = enc.get_hierarchy(settings)

for s in hier[2]:
    children = []
    for key, values in t.superclasses.items():
        for i, v in enumerate(values):
            if s == v:
                if i > 0:
                    if values[i-1] not in children:
                        children.append(values[i-1])
    superclass2_children[s] = children

In [41]:
hier[1]


Out[41]:
['acantharia',
 'appendicularians',
 'calanoid',
 'calycophoran_siphonophores',
 'chaetognaths',
 'crustaceans',
 'ctenophores',
 'cyclopoid_copepods',
 'cydippid',
 'decapods_all',
 'detritus',
 'diatoms',
 'echinoderm',
 'euphausiids_all_ages',
 'fish',
 'gastropods',
 'gelatinous zooplankton',
 'no_class',
 'oithona',
 'other_hydromedusae',
 'other_invert_larvae',
 'physonect',
 'plankton',
 'pluteus',
 'protists',
 'pteropods',
 'radiolarian',
 'rocketship',
 'seastar',
 'shrimp_like',
 'siphonophores',
 'sphaeronectes',
 'sub_hydromedusae1',
 'sub_hydromedusae2',
 'sub_protists',
 'trichodesmium',
 'tunicate',
 'unknown']

In [39]:
superclass2_children


Out[39]:
{'calycophoran_siphonophores': ['rocketship',
  'sphaeronectes',
  'siphonophore_calycophoran_abylidae'],
 'copepods': ['calanoid', 'cyclopoid_copepods'],
 'crustaceans': ['shrimp_like',
  'copepods',
  'stomatopod',
  'amphipods',
  'crustacean_other'],
 'ctenophores': ['ctenophore_cestid', 'ctenophore_lobate', 'cydippid'],
 'cyclopoid_copepods': ['oithona', 'copepod_cyclopoid_copilia'],
 'echinoderm': ['seastar',
  'echinoderm_seacucumber_auricularia_larva',
  'pluteus'],
 'gastropods': ['pteropods', 'heteropod'],
 'gelatinous zooplankton': ['ctenophores',
  'pelagic_tunicates',
  'hydromedusae',
  'siphonophores',
  'ephyra',
  'jellies_tentacles'],
 'hydromedusae': ['other_hydromedusae',
  'sub_hydromedusae2',
  'sub_hydromedusae1'],
 'no_class': ['artifacts', 'artifacts_edge'],
 'other_invert_larvae': ['echinoderm',
  'tornaria_acorn_worm_larvae',
  'trochophore_larvae',
  'invertebrate_larvae_other_A',
  'invertebrate_larvae_other_B'],
 'pelagic_tunicates': ['appendicularians', 'tunicate'],
 'plankton': ['other_invert_larvae',
  'unknown',
  'crustaceans',
  'gastropods',
  'gelatinous zooplankton',
  'detritus',
  'chaetognaths',
  'protists',
  'chordate_type1',
  'trichodesmium',
  'fish',
  'diatoms',
  'polychaete'],
 'protists': ['acantharia',
  'radiolarian',
  'sub_protists',
  'protist_noctiluca'],
 'shrimp_like': ['decapods_all', 'shrimp-like_other', 'euphausiids_all_ages'],
 'siphonophores': ['calycophoran_siphonophores',
  'physonect',
  'siphonophore_other_parts',
  'siphonophore_partial']}

In [ ]:
layer = t.TaxonomyLayer(1)
weight = 0.01

for index, row in enumerate(nr):
    for key, value in superclass2_children.items():
        proportion = []
        leaf = []
        for i,v in enumerate(value):
            if v in only_leaf_children:
                j = int(np.where(np.array(hier[1]) == v)[0])
                proportion.append(superclasses[index, j])
            else:
                if v in settings.classes:
                    leaf.append(i) 
        for l in leaf:
            denom = sum(proportion)
            proportion = proportion/denom
            col = int(np.where(np.array(hier[2]) == key)[0])
            row[i] = (superclasses2[row, col] - proportion)
    if index%1000 == 0:
    print index

Log loss plot


In [37]:
N = y.shape[0]

In [47]:
logloss = lambda x: -(1./N)*np.log(x[0][x[1]])

In [48]:
ilabels = np.where(labels)[1]

In [49]:
ll = []
for i,(p,l) in enumerate(zip(y,ilabels)):
    ll.append((i,logloss((p,l))))

In [52]:
h=plt.hist(zip(*ll)[1], bins=50)



In [56]:
worst = int(np.where(np.array(ll)[:,1]==max(np.array(ll)[:,1]))[0])
worst


Out[56]:
1512

In [59]:
ll[1512]


Out[59]:
(1512, 0.0050517058948519487)

What does the worst log loss correspond to?


In [58]:
settings.classes[ilabels[worst]]


Out[58]:
u'echinoderm_seacucumber_auricularia_larva'

In [64]:
misclass = int(np.where(y[worst,:121] == max(y[worst,:121]))[0])
misclass


Out[64]:
35

It was misclassified as:


In [65]:
settings.classes[misclass]


Out[65]:
u'detritus_other'

Looks like this:


In [60]:
hl.Image(dataset.X[worst])


Out[60]:

In [70]:
for d in np.where(ilabels == misclass)[0][:10]:
    try:
        c += hl.Image(dataset.X[d])
    except:
        c = hl.Image(dataset.X[d])
c


Out[70]:

In [71]:
hier[2]


Out[71]:
['calycophoran_siphonophores',
 'copepods',
 'crustaceans',
 'ctenophores',
 'cyclopoid_copepods',
 'echinoderm',
 'gastropods',
 'gelatinous zooplankton',
 'hydromedusae',
 'no_class',
 'other_invert_larvae',
 'pelagic_tunicates',
 'plankton',
 'protists',
 'shrimp_like',
 'siphonophores']

In [74]:
np.copyto(predictions, y)
y.shape


Out[74]:
(3089, 188)

In [109]:
%env THEANO_FLAGS = 'device=gpu3,floatX=float32,base_compiledir=~/.theano/stonesoup3'


env: THEANO_FLAGS='device=gpu3,floatX=float32,base_compiledir=~/.theano/stonesoup3'

In [111]:
labels = labels[:, :121]
logloss = sklearn.metrics.log_loss(labels,predictions[:, :121])
print("Log loss: {0}".format(logloss))


Log loss: 0.935334935387

In [116]:
superclasses2 = predictions[:,(121+38):(121+38+16)]
superclasses2.shape


Out[116]:
(3089, 16)

Get children of each second parent superclass.


In [42]:
superclass2_children = {}
settings = neukrill_net.utils.Settings("settings.json")
hier = enc.get_hierarchy(settings)

for s in hier[2]:
    children = []
    for key, values in t.superclasses.items():
        for i, v in enumerate(values):
            if s == v:
                if i > 0:
                    if values[i-1] not in children:
                        children.append(values[i-1])
    superclass2_children[s] = children

In [43]:
superclass2_children


Out[43]:
{'calycophoran_siphonophores': ['rocketship',
  'sphaeronectes',
  'siphonophore_calycophoran_abylidae'],
 'copepods': ['calanoid', 'cyclopoid_copepods'],
 'crustaceans': ['shrimp_like',
  'copepods',
  'stomatopod',
  'amphipods',
  'crustacean_other'],
 'ctenophores': ['ctenophore_cestid', 'ctenophore_lobate', 'cydippid'],
 'cyclopoid_copepods': ['oithona', 'copepod_cyclopoid_copilia'],
 'echinoderm': ['seastar',
  'echinoderm_seacucumber_auricularia_larva',
  'pluteus'],
 'gastropods': ['pteropods', 'heteropod'],
 'gelatinous zooplankton': ['ctenophores',
  'pelagic_tunicates',
  'hydromedusae',
  'siphonophores',
  'ephyra',
  'jellies_tentacles'],
 'hydromedusae': ['other_hydromedusae',
  'sub_hydromedusae2',
  'sub_hydromedusae1'],
 'no_class': ['artifacts', 'artifacts_edge'],
 'other_invert_larvae': ['echinoderm',
  'tornaria_acorn_worm_larvae',
  'trochophore_larvae',
  'invertebrate_larvae_other_A',
  'invertebrate_larvae_other_B'],
 'pelagic_tunicates': ['appendicularians', 'tunicate'],
 'plankton': ['other_invert_larvae',
  'unknown',
  'crustaceans',
  'gastropods',
  'gelatinous zooplankton',
  'detritus',
  'chaetognaths',
  'protists',
  'chordate_type1',
  'trichodesmium',
  'fish',
  'diatoms',
  'polychaete'],
 'protists': ['acantharia',
  'radiolarian',
  'sub_protists',
  'protist_noctiluca'],
 'shrimp_like': ['decapods_all', 'shrimp-like_other', 'euphausiids_all_ages'],
 'siphonophores': ['calycophoran_siphonophores',
  'physonect',
  'siphonophore_other_parts',
  'siphonophore_partial']}

In [72]:
nr = class_predictions
weight = 0.01
nr.shape


Out[72]:
(3089, 121)

In [148]:
layer = t.TaxonomyLayer(1)
row = nr[1,:]
weight = 0.01

for key, value in superclass2_children.items():
    for i,v in enumerate(value):
        if v in only_leaf_children:
            update = {}
            for i, c in enumerate(settings.classes):
                if v == layer[c]:
                    print v
                    update[i] = settings.class_priors[i]
            denom = sum(v for k,v in update.items())
            indUpdate = [k for k,v in update.items()]
            print row[indUpdate]
            row[indUpdate] = row[indUpdate] * (1-weight) + [v for k,v in update.items()/denom * weight]
            print row[indUpdate]


acantharia
acantharia
acantharia
[  9.94898095e-01   8.29245158e-04   4.27219143e-03]
[  9.94085805e-01   9.54560106e-04   4.95917146e-03]
radiolarian
radiolarian
[ 0.03774202  0.02077784]
[ 0.04381403  0.02412063]
sub_protists
sub_protists
sub_protists
sub_protists
[ 0.00358086  0.01233393  0.03885892  0.0037466 ]
[ 0.00415695  0.01431824  0.04511056  0.00434936]
calanoid
calanoid
calanoid
calanoid
calanoid
calanoid
calanoid
calanoid
calanoid
calanoid
[ 0.02286404  0.00580834  0.00322312  0.00597621  0.00211518  0.00960222
  0.00355887  0.00164514  0.00292096  0.00080578]
[ 0.02654245  0.0067428   0.00374167  0.00693768  0.00245547  0.01114705
  0.00413142  0.00190981  0.00339089  0.00093542]
other_hydromedusae
other_hydromedusae
other_hydromedusae
other_hydromedusae
other_hydromedusae
other_hydromedusae
other_hydromedusae
other_hydromedusae
other_hydromedusae
other_hydromedusae
other_hydromedusae
other_hydromedusae
[ 0.00050232  0.00795334  0.01724619  0.01146956  0.00627895  0.00318134
  0.00179997  0.00234414  0.00058604  0.00255344  0.00313948  0.00146509]
[ 0.00058313  0.00923289  0.0200208   0.0133148   0.00728913  0.00369316
  0.00208955  0.00272127  0.00068032  0.00296424  0.00364456  0.0017008 ]
sub_hydromedusae2
sub_hydromedusae2
sub_hydromedusae2
sub_hydromedusae2
sub_hydromedusae2
[ 0.00586532  0.01492989  0.00546541  0.03123725  0.00102199]
[ 0.00680894  0.01733185  0.00634469  0.03626276  0.00118641]
sub_hydromedusae1
sub_hydromedusae1
sub_hydromedusae1
sub_hydromedusae1
[ 0.01935422  0.03489856  0.00137156  0.00289551]
[ 0.02246797  0.04051311  0.00159222  0.00336135]
pteropods
pteropods
pteropods
[ 0.0040901   0.02045049  0.03397927]
[ 0.00474812  0.0237406   0.03944593]
rocketship
rocketship
[ 0.01278346  0.04573639]
[ 0.01484009  0.05309456]
sphaeronectes
sphaeronectes
sphaeronectes
[ 0.02168748  0.00690607  0.0299263 ]
[ 0.02517661  0.00801713  0.03474091]
oithona
oithona
[ 0.02519605  0.0333238 ]
[ 0.02924964  0.03868501]
cydippid
cydippid
[ 0.02587193  0.03264792]
[ 0.03003427  0.03790038]
appendicularians
appendicularians
appendicularians
appendicularians
[ 0.00063009  0.02095058  0.02740903  0.00953015]
[ 0.00073146  0.02432115  0.03181865  0.01106338]
tunicate
tunicate
tunicate
tunicate
tunicate
[ 0.01693488  0.01608621  0.01357877  0.00910395  0.00281605]
[ 0.0196594   0.01867419  0.01576335  0.01056861  0.0032691 ]
unknown
unknown
unknown
[ 0.02712207  0.02022987  0.01116791]
[ 0.03148553  0.0234845   0.01296463]
detritus
detritus
detritus
detritus
[ 0.00973543  0.01056683  0.0245129   0.01370469]
[ 0.01130169  0.01226684  0.02845659  0.01590954]
chaetognaths
chaetognaths
chaetognaths
[ 0.01385236  0.03287174  0.01179575]
[ 0.01608096  0.03816021  0.01369348]
trichodesmium
trichodesmium
trichodesmium
trichodesmium
[ 0.01211818  0.00092427  0.03387271  0.0116047 ]
[ 0.01406778  0.00107297  0.03932222  0.01347169]
fish
fish
fish
fish
fish
fish
[ 0.00182875  0.00566911  0.01554434  0.0208477   0.01170397  0.00292599]
[ 0.00212296  0.00658117  0.01804514  0.02420172  0.01358693  0.00339673]
diatoms
diatoms
[ 0.0298055   0.02871435]
[ 0.03460067  0.03333398]
seastar
seastar
[ 0.0244627   0.03405716]
[ 0.02839831  0.03953634]
pluteus
pluteus
pluteus
pluteus
pluteus
[ 0.01449412  0.01594349  0.00489175  0.00652234  0.01666819]
[ 0.01682596  0.01850851  0.00567875  0.00757166  0.01934981]
decapods_all
decapods_all
decapods_all
decapods_all
[ 0.00746773  0.00665307  0.02077387  0.02362518]
[ 0.00866916  0.00772343  0.02411601  0.02742605]
euphausiids_all_ages
euphausiids_all_ages
[ 0.0127802   0.04573965]
[ 0.0148363   0.05309835]
physonect
physonect
[ 0.05027209  0.00824776]
[ 0.05835997  0.00957468]

In [157]:
row == predictions[1,:121]


Out[157]:
array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True], dtype=bool)

In [62]:
only_leaf_children


Out[62]:
['acantharia',
 'appendicularians',
 'calanoid',
 'chaetognaths',
 'cydippid',
 'decapods_all',
 'detritus',
 'diatoms',
 'euphausiids_all_ages',
 'fish',
 'no_class',
 'oithona',
 'other_hydromedusae',
 'physonect',
 'pluteus',
 'pteropods',
 'radiolarian',
 'rocketship',
 'seastar',
 'sphaeronectes',
 'sub_hydromedusae1',
 'sub_hydromedusae2',
 'sub_protists',
 'trichodesmium',
 'tunicate',
 'unknown']

Matt's way of updating predictions:


In [178]:
layer = t.TaxonomyLayer(1)
hier = enc.get_hierarchy(settings)
priors = np.zeros(len(hier[1]))
superclass_children = [[] for k in range(len(hier[1]))]

# For each class
for i, c in enumerate(settings.classes):
    # Find index of its first parent in the 1-of-k encoding array
    j = int(np.where(np.array(hier[1]) == layer[c])[0])
    # Add class's prior for that superclass group
    priors[j] += settings.class_priors[i]
    # Record index of this class as a child or superclass
    superclass_children[j].append(i)

In [179]:
priors


Out[179]:
array([ 0.0320741 ,  0.0489847 ,  0.05745649,  0.0069884 ,  0.11349552,
        0.00903217,  0.00497758,  0.00098892,  0.00313159,  0.01420754,
        0.07192774,  0.03359045,  0.00316456,  0.00573576,  0.01054852,
        0.00032964,  0.00510944,  0.01855881,  0.06882911,  0.04608386,
        0.00346123,  0.00491166,  0.00685654,  0.01064742,  0.02060258,
        0.00613133,  0.01466904,  0.02037184,  0.03035997,  0.00171414,
        0.00194488,  0.01592168,  0.01265823,  0.04341377,  0.0581817 ,
        0.11270438,  0.05000659,  0.03022811])

In [180]:
superclass_children


Out[180]:
[[0, 1, 2],
 [4, 5, 6, 7],
 [14, 15, 16, 17, 18, 19, 20, 21, 22, 26],
 [96],
 [10, 11, 12],
 [3, 27, 106],
 [28, 31],
 [23],
 [29, 30],
 [32, 92, 94, 95],
 [33, 34, 35, 49],
 [36, 37],
 [44],
 [47, 48],
 [50, 51, 52, 53, 54, 55],
 [56],
 [46, 80],
 [8, 9],
 [24, 25],
 [58, 59, 66, 67, 68, 69, 70, 71, 74, 75, 76, 77],
 [78, 79, 107, 112],
 [104, 105],
 [13, 81],
 [38, 39, 40, 41, 45],
 [84],
 [87, 88, 89],
 [90, 91],
 [97, 98],
 [42, 43],
 [93],
 [102, 103],
 [99, 100, 101],
 [57, 60, 61, 62],
 [63, 64, 65, 72, 73],
 [82, 83, 85, 86],
 [108, 109, 110, 111],
 [113, 114, 115, 116, 117],
 [118, 119, 120]]

In [182]:
new_class_predictions = np.zeros(class_predictions.shape)
new_class_predictions.shape


Out[182]:
(3089, 121)

In [183]:
new_class_predictions = np.zeros(class_predictions.shape)

for index, row in enumerate(class_predictions):
    for i, c in enumerate(settings.classes):
        j = int(np.where(np.array(hier[1]) == layer[c])[0])
        new_class_predictions[index, i] = superclasses[index, j] * (settings.class_priors[i] / priors[j])

In [184]:
new_class_predictions.shape


Out[184]:
(3089, 121)

In [185]:
updated_predictions = np.zeros(class_predictions.shape)
updated_predictions = predictions[:, :121] * (1-weight) + weight * new_class_predictions

In [186]:
labels = labels[:, :121]
logloss = sklearn.metrics.log_loss(labels, predictions[:,:121])
print("Log loss: {0}".format(logloss))


Log loss: 0.935334935387

In [187]:
logloss = sklearn.metrics.log_loss(labels, updated_predictions)
print("Log loss: {0}".format(logloss))


Log loss: 0.951994559234

In [188]:
weight


Out[188]:
0.10000000000000001

In [189]:
results = []
weights = np.logspace(-5,-1, 20)
for weight in weights:
    updated_predictions = np.zeros(121)
    updated_predictions = predictions[:, :121] * (1-weight) + weight * new_class_predictions
    logloss = sklearn.metrics.log_loss(labels, updated_predictions)
    results.append(logloss)

In [190]:
results


Out[190]:
[0.93532807287028685,
 0.93532380674921556,
 0.93531690318819105,
 0.93530575523596571,
 0.93528781447747555,
 0.93525909830576115,
 0.93521352888105802,
 0.93514218153182349,
 0.9350327533300572,
 0.93487000533804321,
 0.93463851464994019,
 0.93432941445079354,
 0.93395248184095403,
 0.93355463437203612,
 0.93324793276374252,
 0.93325625037706783,
 0.93399910962887434,
 0.93624466111408577,
 0.94139048222136512,
 0.95199455923379972]

In [191]:
plt.semilogx(weights, results)


Out[191]:
[<matplotlib.lines.Line2D at 0x7f81b62a7f10>]

In [195]:
predictions = np.zeros(y.shape)
np.copyto(predictions, y)


Out[195]:
(3089, 2)

In [159]:
class_predictions.shape


Out[159]:
(3089, 121)

Make a big list with predictions for each vector.


In [170]:
all_predictions = [[] for k in range(len(hier))]
start = 0

for i in range(len(hier)):
    end = len(hier[i])
    all_predictions[i] = predictions[:, start:(start + end)]
    start = start + end

Check the dimensions are right.


In [173]:
[y.shape for y in all_predictions]


Out[173]:
[(3089, 121), (3089, 38), (3089, 16), (3089, 7), (3089, 4), (3089, 2)]

Make a list for priors for all vectors.


In [152]:
priors = [[] for k in range(len(hier))]

for i in range(len(hier)):
    priors[i] = np.zeros(len(hier[i])) 

priors[0] = settings.class_priors

Set it with class priors in the zeroth vector.


In [153]:
priors


Out[153]:
[array([ 0.02930512,  0.00042853,  0.00234045,  0.00161524,  0.00052743,
         0.01753692,  0.02294304,  0.00797732,  0.01295491,  0.0056039 ,
         0.02686577,  0.06375264,  0.02287711,  0.00253824,  0.02244858,
         0.0057028 ,  0.00316456,  0.00586762,  0.00207674,  0.00942774,
         0.0034942 ,  0.00161524,  0.00286788,  0.00098892,  0.02963476,
         0.03919436,  0.00079114,  0.00662579,  0.00372495,  0.00138449,
         0.0017471 ,  0.00125264,  0.00181303,  0.01196598,  0.01298787,
         0.03012922,  0.01710839,  0.01648207,  0.00118671,  0.0030327 ,
         0.00263713,  0.00290084,  0.01269119,  0.01766878,  0.00316456,
         0.00089003,  0.0004615 ,  0.00448312,  0.00125264,  0.01684467,
         0.00032964,  0.00102189,  0.00280195,  0.00375791,  0.0021097 ,
         0.00052743,  0.00032964,  0.00418645,  0.00247231,  0.00115374,
         0.00754879,  0.00029668,  0.00062632,  0.00075818,  0.00435127,
         0.01107595,  0.00039557,  0.00626319,  0.01358122,  0.00903217,
         0.00494462,  0.00250527,  0.02317379,  0.00405459,  0.00141746,
         0.00184599,  0.0004615 ,  0.00201081,  0.0004615 ,  0.00079114,
         0.00464794,  0.0043183 ,  0.00356013,  0.01226266,  0.02060258,
         0.03863397,  0.00372495,  0.00356013,  0.00042853,  0.00214267,
         0.00946071,  0.00520833,  0.00161524,  0.00171414,  0.00504351,
         0.00573576,  0.0069884 ,  0.00445016,  0.01592168,  0.00590058,
         0.00187896,  0.00814214,  0.00095596,  0.00098892,  0.00421941,
         0.00069225,  0.00079114,  0.00125264,  0.02333861,  0.00178006,
         0.06523602,  0.02234968,  0.00095596,  0.01447126,  0.01374604,
         0.01160338,  0.00777954,  0.00240638,  0.01044963,  0.00576872,
         0.01400976]),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.]),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 array([ 0.,  0.,  0.,  0.]),
 array([ 0.,  0.])]

Propagate up to compute superclass group priors in all vectors.


In [154]:
# For each class
for index, h in enumerate(hier):
    if (index < 5):
        layer = t.TaxonomyLayer(index + 1)

        for i, c in enumerate(settings.classes):    
            # Find index of its parent in 1-of-k encodings
            j = int(np.where(np.array(hier[index + 1]) == layer[c])[0])

            # Add class's prior for that superclass group
            priors[index + 1][j] += settings.class_priors[i]

In [155]:
priors


Out[155]:
[array([ 0.02930512,  0.00042853,  0.00234045,  0.00161524,  0.00052743,
         0.01753692,  0.02294304,  0.00797732,  0.01295491,  0.0056039 ,
         0.02686577,  0.06375264,  0.02287711,  0.00253824,  0.02244858,
         0.0057028 ,  0.00316456,  0.00586762,  0.00207674,  0.00942774,
         0.0034942 ,  0.00161524,  0.00286788,  0.00098892,  0.02963476,
         0.03919436,  0.00079114,  0.00662579,  0.00372495,  0.00138449,
         0.0017471 ,  0.00125264,  0.00181303,  0.01196598,  0.01298787,
         0.03012922,  0.01710839,  0.01648207,  0.00118671,  0.0030327 ,
         0.00263713,  0.00290084,  0.01269119,  0.01766878,  0.00316456,
         0.00089003,  0.0004615 ,  0.00448312,  0.00125264,  0.01684467,
         0.00032964,  0.00102189,  0.00280195,  0.00375791,  0.0021097 ,
         0.00052743,  0.00032964,  0.00418645,  0.00247231,  0.00115374,
         0.00754879,  0.00029668,  0.00062632,  0.00075818,  0.00435127,
         0.01107595,  0.00039557,  0.00626319,  0.01358122,  0.00903217,
         0.00494462,  0.00250527,  0.02317379,  0.00405459,  0.00141746,
         0.00184599,  0.0004615 ,  0.00201081,  0.0004615 ,  0.00079114,
         0.00464794,  0.0043183 ,  0.00356013,  0.01226266,  0.02060258,
         0.03863397,  0.00372495,  0.00356013,  0.00042853,  0.00214267,
         0.00946071,  0.00520833,  0.00161524,  0.00171414,  0.00504351,
         0.00573576,  0.0069884 ,  0.00445016,  0.01592168,  0.00590058,
         0.00187896,  0.00814214,  0.00095596,  0.00098892,  0.00421941,
         0.00069225,  0.00079114,  0.00125264,  0.02333861,  0.00178006,
         0.06523602,  0.02234968,  0.00095596,  0.01447126,  0.01374604,
         0.01160338,  0.00777954,  0.00240638,  0.01044963,  0.00576872,
         0.01400976]),
 array([ 0.0320741 ,  0.0489847 ,  0.05745649,  0.0069884 ,  0.11349552,
         0.00903217,  0.00497758,  0.00098892,  0.00313159,  0.01420754,
         0.07192774,  0.03359045,  0.00316456,  0.00573576,  0.01054852,
         0.00032964,  0.00510944,  0.01855881,  0.06882911,  0.04608386,
         0.00346123,  0.00491166,  0.00685654,  0.01064742,  0.02060258,
         0.00613133,  0.01466904,  0.02037184,  0.03035997,  0.00171414,
         0.00194488,  0.01592168,  0.01265823,  0.04341377,  0.0581817 ,
         0.11270438,  0.05000659,  0.03022811]),
 array([ 0.03629351,  0.05844541,  0.00171414,  0.00313159,  0.06882911,
         0.04100738,  0.00613133,  0.00692247,  0.10215585,  0.01855881,
         0.00316456,  0.0989913 ,  0.41788634,  0.10492484,  0.0199433 ,
         0.01190005]),
 array([ 0.06882911,  0.07838871,  0.2161788 ,  0.01855881,  0.04100738,
         0.54074367,  0.03629351]),
 array([ 0.06882911,  0.03629351,  0.01855881,  0.87631857]),
 array([ 0.01855881,  0.98144119])]

Check they sum to 1.


In [156]:
[sum(p) for p in priors]


Out[156]:
[0.99999999999999978, 0.99999999999999989, 1.0, 1.0, 1.0, 0.99999999999999978]

In [166]:
new_predictions = [np.zeros(class_predictions.shape) for k in range(len(hier))]
new_predictions[0] = class_predictions


Out[166]:
6

In [177]:
new_predictions[0][0,0]


Out[177]:
0.99938714504241943

In [168]:
# For each array of new predictions
for pred in range(len(new_predictions)):
    # No need to update the zeroth one
    if pred > 0:
        layer = t.TaxonomyLayer(pred)
        # For each image
        for index, row in enumerate(new_predictions[pred]):
            # For each class
            for i, c in enumerate(settings.classes):
                # Find the parent
                j = int(np.where(np.array(hier[pred]) == layer[c])[0])
                new_predictions[pred][] = superclasses[index, j] * (settings.class_priors[i] / priors[j])


0
1
2
3
4
5